## Replication code for "Family Matters?"
## This is the main merge script (it won't run without non-provided data; see readme file for details)
### Summer 2018
### See the readme file for more details on what goes where in this replication package
### Contact Ariel White with questions: arwhi@mit.edu

# Start from an empty global environment so that objects left over from an
# earlier session cannot leak into the merge.
rm(list=ls())
# Every load()/save() in this script uses a bare file name, so all data files are
# read from and written to this directory. The absolute path is machine-specific:
# point it at the local data directory before running.
setwd("/nfs/projects_nobackup/f/FLvote/Texas/merging")

library(data.table)
library(sp) # Set CRS
library(spdep) # Spatial statistics and definitions
library(rgeos) # buffer and polygon analysis
library(maptools)
library(maps)

# First defendant extract (file name: 2000-2012). The code expects the file to
# provide `geodef12flat`, which is copied before the next load() replaces it.
load(file = "Harrisdefendants20002012_Arcgeocoded.Rdata")
harrismatch1 <- geodef12flat
dim(harrismatch1)

# Second defendant extract (file name: 2012-2014). Rows with fyear == 2012 are
# removed so that year is not present in both extracts.
load(file = "Harrisdefendants20122014_Arcgeocoded.Rdata")
update <- geodef12flat[!(geodef12flat$fyear == 2012), ]
dim(update)
dim(geodef12flat)

# Stack the two extracts into a single defendant table.
harrismatch <- rbind(harrismatch1, update)
dim(harrismatch)

#trim down to Harris county addresses? using zipcode http://www.zillow.com/browse/homes/tx/harris-county/
# Zip codes used below to decide which geocoded defendant addresses are kept.
# NOTE(review): several entries fall outside the 77xxx range (e.g. 63362, 28056,
# 33935, 80498, 36862), so they look like stray values picked up from the source
# page rather than Houston-area zips -- confirm before reusing this list.
Harriscozips <- c(63362, 77002, 77004, 77003, 77006, 77005, 77008, 77007, 77010, 77009, 77012, 77011, 75032, 77014, 77013, 77016, 63383, 77015, 77018, 77017, 77020, 77019, 77022, 77021, 77024, 77023, 77026, 77025, 77028, 77027, 77030, 77029, 77032, 77031, 77034, 77033, 77036, 77035, 77038, 77037, 77040, 77039, 77042, 77041, 77044, 77043, 77046, 77045, 77048, 77047, 77050, 77049, 77051, 77054, 77053, 77056, 77055, 77058, 28056, 77057, 77060, 77059, 77062, 77061, 77064, 77063, 77066, 77065, 77068, 77067, 77070, 77069, 76048, 77072, 77071, 77074, 77073, 77076, 77075, 77078, 77077, 77080, 77079, 77082, 77081, 77084, 77083, 77086, 77085, 77088, 77087, 77090, 75134, 77089, 77092, 77091, 77094, 78108, 77093, 77096, 77095, 77098, 77099, 77204, 63627, 75160, 77217, 77238, 77249, 77255, 75229, 77266, 77268, 76226, 77306, 77318, 77316, 77325, 76247, 77334, 78266, 77336, 77339, 77338, 77345, 76270, 77346, 77354, 33935, 77356, 77355, 77357, 77362, 77365, 77373, 77375, 77377, 77379, 77381, 77380, 77383, 77382, 77385, 80498, 77384, 77386, 77389, 77388, 77391, 77396, 77401, 76437, 77406, 77410, 77423, 77429, 77433, 77441, 76472, 77445, 77447, 77450, 77449, 77459, 76513, 77469, 77471, 77478, 77477, 77479, 77482, 77484, 77489, 77493, 77494, 77503, 77502, 77505, 77504, 36862, 77507, 77506, 77510, 77514, 77521, 77520, 77530, 77532, 77531, 77523, 77407, 77536, 77498, 77535, 77539, 79708, 77546, 77545, 77547, 77554, 77562, 77571, 77573, 77578, 77581, 77584, 77583, 77586, 77587, 77590, 77598, 77650, 77663, 75758, 78669, 79938, 77845, 78734)
dim(harrismatch)
# Successive filters on the stacked defendant table:
#   1. geocoded zip is on the Harriscozips list (this also removes most
#      addresses that failed to geocode),
#   2. geocoder match score above 48 (a deliberately lenient cutoff),
#   3. crt above 15 (felony cases, per the original author's note).
harrismatch <- harrismatch[harrismatch$ARC_ZIP %in% Harriscozips, ]
harrismatch <- harrismatch[harrismatch$Score >48,]
harrismatch <- harrismatch[harrismatch$crt>15,]
dim(harrismatch)

#clean up/subset as needed.
# Give the generic coordinate columns descriptive names.
colnames(harrismatch)[colnames(harrismatch) == "coords.x2"] <- "Latitude"
colnames(harrismatch)[colnames(harrismatch) == "coords.x1"] <- "Longitude"
# Parse the yyyymmdd filing date. The original ran this identical assignment
# twice (before and after the renames); once is enough and gives the same column.
harrismatch$firstcase_date <- as.Date(as.character(harrismatch$firstcased), format = "%Y%m%d")

#give defendants coordinates
# Keep a plain (non-spatial) copy of the table before it is converted below.
harrismatch_unproj <- harrismatch #keep one non-projected
# Promote the table to a spatial points object using the Longitude/Latitude columns.
coordinates(harrismatch) <- c("Longitude", "Latitude")

# Declare the coordinates as unprojected WGS84 longitude/latitude.
proj4string(harrismatch) <- CRS(" +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") #see earlier file: when I pulled the geocoded voters in as a shapefile from Arc, this is the projection it had.
#switch to meters
# Reproject to UTM with metre units so that buffer widths can be given in metres.
# NOTE(review): the string requests UTM zone 14; Houston (about 95.4 W) lies in
# the zone-15 band (96 W to 90 W) -- confirm that zone 14 is intended.
harrismatchproj <- spTransform(harrismatch,CRS(" +proj=utm +zone=14 +ellps=WGS84 +datum=WGS84 +units=m +no_defs +towgs84=0,0,0")) 

#voters
library(data.table)
# Geocoded 2013 voter file; the code below expects it to provide `fullfile`.
load(file = "Harrisvoters2013_Arcgeocoded.Rdata")

# Keep only voters whose geocoding Status is "M" (matched to an address).
harrisproj <- fullfile[fullfile$Status == "M", ]
dim(fullfile)
dim(harrisproj)

# Rename the Y/X columns in one call.
setnames(harrisproj, old = c("Y", "X"), new = c("Latitude", "Longitude"))
head(harrisproj$Longitude)
harrisdf <- harrisproj # plain, non-spatial copy of the voter table

rm(fullfile)

# Convert to spatial points, declare WGS84 longitude/latitude, then reproject
# to the same UTM (metre) system used for the defendants.
coordinates(harrisproj) <- c("Longitude", "Latitude")
proj4string(harrisproj) <- CRS(" +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
harrisproj <- spTransform(
  harrisproj,
  CRS(" +proj=utm +zone=14 +ellps=WGS84 +datum=WGS84 +units=m +no_defs +towgs84=0,0,0")
)

# Store the zip code as character on both layers so voters and defendants can
# be compared one zip at a time (cheaper than searching every voter).
harrismatchproj$ARCZIPCHAR <- as.character(harrismatchproj$ARC_ZIP)
harrisproj$ARCZIPCHAR <- as.character(harrisproj$ARC_ZIP)

# Placeholder for the nested list of per-zip results.
voterlistlist <- list()

# Defendants sorted by zip, and the distinct zips that will be iterated over.
harrismatchproj1 <- harrismatchproj[order(harrismatchproj$ARCZIPCHAR), ]
uniquezips <- unique(harrismatchproj1$ARCZIPCHAR)

# Slim voter layer: only the columns needed for matching; the remaining voter
# fields are merged back in after the spatial search.
keep <- c("coordinates", "coords.x1", "coords.x2", "state_file_id", "ARCZIPCHAR")
harrisproj1 <- harrisproj[, names(harrisproj) %in% keep]

# Find the voters located inside a width-3 buffer around one defendant.
# This replaces the inner loop over all defendants in a zip code.
#
# i             : row index of the defendant within `smallerdefs`.
# smallerdefs   : spatial points for the defendants of one zip code.
# smallervoters : spatial points for the voters of the same zip code.
#
# Returns a data.frame of the voters inside the buffer with two added columns,
# `distances` (voter-to-defendant distance) and `def_spn` (the defendant's id),
# or NULL when no voter falls inside the buffer.
buffer <- function(i, smallerdefs, smallervoters) {
  defendant <- smallerdefs[i, ]
  # Single-point buffer around the defendant (width is in the layer's map units).
  zone <- gBuffer(defendant, width = 3, byid = TRUE)
  inside_idx <- which(gContains(zone, smallervoters, byid = TRUE))
  voters.inside <- as.data.frame(smallervoters[inside_idx, ])
  if (nrow(voters.inside) < 1) {
    return(NULL)
  }
  # Distances are measured between the voters' coords.x1/coords.x2 columns and
  # the defendant's X/Y columns (lat/long, per the original author's note),
  # not between the projected coordinates used for the buffer.
  voter_pts <- voters.inside
  coordinates(voter_pts) <- c("coords.x1", "coords.x2")
  def_pt <- as.data.frame(defendant)
  coordinates(def_pt) <- c("X", "Y")
  voters.inside$distances <- spDistsN1(voter_pts, def_pt)
  voters.inside$def_spn <- defendant$def_spn
  voters.inside
}


library(doParallel)
# Start 12 workers and register them as the backend used by %dopar%.
cl <- makeCluster(spec = 12)
registerDoParallel(cl = cl)
voterlistlist <- list() # reset the container for the per-zip results
strt <- Sys.time() # start of the timed section

# Match every defendant in one zip code against the voters of that same zip.
#
# j                : index into `uniquezips` selecting the zip code to process.
# harrisproj1      : voter points carrying an ARCZIPCHAR column.
# harrismatchproj1 : defendant points carrying an ARCZIPCHAR column.
# uniquezips       : vector of zip codes.
#
# Returns a list with one element per defendant in the zip, each element being
# whatever buffer() returned for that defendant.
outerloop <- function(j, harrisproj1, harrismatchproj1, uniquezips) {
  zip <- uniquezips[j]
  # Restrict both layers to the current zip before searching.
  smallervoters <- harrisproj1[harrisproj1$ARCZIPCHAR == zip, ]
  smallerdefs <- harrismatchproj1[harrismatchproj1$ARCZIPCHAR == zip, ]
  # seq_len() gives an empty sequence (and so an empty list) when the zip has no
  # defendants, where 1:nrow() would have iterated over c(1, 0).
  voterlist <- foreach(i = seq_len(nrow(smallerdefs))) %do%
    buffer(i, smallerdefs, smallervoters)
  voterlist
}

# Run the per-zip matching on the registered cluster, one task per zip code.
# The listed packages are loaded on every worker.
voterlistlist <- foreach(j = seq_along(uniquezips), .packages = c('rgeos', 'sp', 'spdep', 'rgdal', 'foreach')) %dopar% (
  outerloop(j, harrisproj1, harrismatchproj1, uniquezips)
)
# The parallel section is finished: shut the workers down instead of leaving
# them running for the rest of the session.
stopCluster(cl)

class(voterlistlist)

# Drop empty per-zip results. vapply() always returns a logical vector, so this
# also works when nothing was returned at all.
voterlisttrim <- voterlistlist[!vapply(voterlistlist, is.null, logical(1))]
# Unzip one layer of lists: stack each zip's per-defendant data frames ...
voterl3 <- lapply(voterlisttrim, function(zip_results) do.call("rbind", zip_results))
# ... then stack the zips into one table of voter/defendant pairs.
d <- do.call("rbind", voterl3)
print(Sys.time()-strt)

df1 <- d

#now merge the full voter file back in.
dim(df1)
summary(df1$distances)

# Left join on the voter id: every matched pair keeps its row and picks up the
# remaining voter-file columns.
df2 <- merge(
  df1, harrisproj,
  by.x = "state_file_id", by.y = "state_file_id",
  all.x = TRUE
)
dim(df1)
dim(harrisproj)
dim(df2)

neighborsdist <- df2
# NOTE(review): `distances` comes from spDistsN1() inside buffer(); confirm that
# a factor of 1000 really converts its unit to metres.
neighborsdist$meters <- neighborsdist$distances * 1000
summary(neighborsdist$distances)
summary(neighborsdist$meters)

# Keep pairs with a known distance of at most .005.
allclose <- neighborsdist[
  neighborsdist$distances <= .005 & !is.na(neighborsdist$distances),
]
summary(allclose$distances)
dim(allclose)
dim(neighborsdist)

# Steps that follow from here:
#  - keep everyone close to a defendant with <10 voters at an address,
#  - merge that table to the court records data (for crt asst, sentencing),
#  - drop any defendant who appears to be in the voter file themselves.
closeones <- allclose
length(unique(closeones$def_spn))

library(data.table)
library(sp) 
library(spdep) 
library(rgeos) 
library(maptools)
library(maps)

# Reload the two defendant extracts, this time to build the plain court-records
# table that is merged onto the matched voters.
load(file = "Harrisdefendants20002012_Arcgeocoded.Rdata")
harrismatch1 <- geodef12flat
dim(harrismatch1)

# The second file replaces `geodef12flat`; its fyear == 2012 rows are removed
# to avoid duplication with the first extract.
load(file = "Harrisdefendants20122014_Arcgeocoded.Rdata")
update <- geodef12flat[!(geodef12flat$fyear == 2012), ]
dim(update)
dim(geodef12flat)

# Stack the two extracts.
harrismatch <- rbind(harrismatch1, update)
dim(harrismatch)

#trim down to Harris county addresses? using zipcode http://www.zillow.com/browse/homes/tx/harris-county/
# Same zip-code list as defined earlier in the script, redefined here for the
# second pass over the defendant files.
# NOTE(review): several entries fall outside the 77xxx range (e.g. 63362, 28056,
# 33935, 80498, 36862) -- confirm they are intended.
Harriscozips <- c(63362, 77002, 77004, 77003, 77006, 77005, 77008, 77007, 77010, 77009, 77012, 77011, 75032, 77014, 77013, 77016, 63383, 77015, 77018, 77017, 77020, 77019, 77022, 77021, 77024, 77023, 77026, 77025, 77028, 77027, 77030, 77029, 77032, 77031, 77034, 77033, 77036, 77035, 77038, 77037, 77040, 77039, 77042, 77041, 77044, 77043, 77046, 77045, 77048, 77047, 77050, 77049, 77051, 77054, 77053, 77056, 77055, 77058, 28056, 77057, 77060, 77059, 77062, 77061, 77064, 77063, 77066, 77065, 77068, 77067, 77070, 77069, 76048, 77072, 77071, 77074, 77073, 77076, 77075, 77078, 77077, 77080, 77079, 77082, 77081, 77084, 77083, 77086, 77085, 77088, 77087, 77090, 75134, 77089, 77092, 77091, 77094, 78108, 77093, 77096, 77095, 77098, 77099, 77204, 63627, 75160, 77217, 77238, 77249, 77255, 75229, 77266, 77268, 76226, 77306, 77318, 77316, 77325, 76247, 77334, 78266, 77336, 77339, 77338, 77345, 76270, 77346, 77354, 33935, 77356, 77355, 77357, 77362, 77365, 77373, 77375, 77377, 77379, 77381, 77380, 77383, 77382, 77385, 80498, 77384, 77386, 77389, 77388, 77391, 77396, 77401, 76437, 77406, 77410, 77423, 77429, 77433, 77441, 76472, 77445, 77447, 77450, 77449, 77459, 76513, 77469, 77471, 77478, 77477, 77479, 77482, 77484, 77489, 77493, 77494, 77503, 77502, 77505, 77504, 36862, 77507, 77506, 77510, 77514, 77521, 77520, 77530, 77532, 77531, 77523, 77407, 77536, 77498, 77535, 77539, 79708, 77546, 77545, 77547, 77554, 77562, 77571, 77573, 77578, 77581, 77584, 77583, 77586, 77587, 77590, 77598, 77650, 77663, 75758, 78669, 79938, 77845, 78734)
dim(harrismatch)
# Same three filters as in the first pass: zip on the Harriscozips list,
# geocoder Score above 48, and crt above 15.
harrismatch <- harrismatch[harrismatch$ARC_ZIP %in% Harriscozips, ] 
harrismatch <- harrismatch[harrismatch$Score >48,] #drop all non-matched adds (but keep score cutoff pretty low for now)
harrismatch <- harrismatch[harrismatch$crt>15,]  #keep only felony cases
dim(harrismatch)

#clean up court records
# Give the generic coordinate columns descriptive names.
colnames(harrismatch)[colnames(harrismatch) == "coords.x2"] <- "Latitude" 
colnames(harrismatch)[colnames(harrismatch) == "coords.x1"] <- "Longitude" 
# Parse the yyyymmdd filing date into a Date column.
harrismatch$firstcase_date <-  as.Date(as.character(harrismatch$firstcased), format = "%Y%m%d")
# Compare the row count with the number of distinct defendant ids to see how many ids repeat.
dim(harrismatch); length(unique(harrismatch$def_spn)) #few dozen dupes due to weird cases (like people ending up with different cases filed exact same day in two courts)-- drop them
# Remove EVERY row whose def_spn occurs more than once (scanning from both ends
# marks all copies, not only the later ones), so no duplicated id survives.
harrismatch1a <-harrismatch[!(duplicated(harrismatch$def_spn) | duplicated(harrismatch$def_spn, fromLast = TRUE)), ]
# Convert to a data.table; note that this overwrites the earlier harrismatch1.
harrismatch1 <- as.data.table(harrismatch1a) 

# Attach each matched defendant's court record to the voter rows. This is a
# left join, so voter rows whose defendant was dropped as a duplicate get NA
# court fields.
closeones_m1 <- merge(
  closeones, harrismatch1,
  by.x = "def_spn", by.y = "def_spn",
  all.x = TRUE
)
dim(closeones_m1)
dim(closeones)
dim(harrismatch1)
length(unique(harrismatch1$def_spn))

matnbors <- as.data.table(closeones_m1)
mneighbors <- matnbors[crt > 15] # felony cases only
dim(mneighbors)

#now drop defendants who are themselves in the voter file
library(lubridate)
# Upper-cased voter last name with leading/trailing spaces stripped.
mneighbors$mergelname <- trimws(toupper(as.character(mneighbors$last_name)))
# Parse voter (born_at) and defendant (def_dob) birth dates into Dates.
mneighbors[, `:=`(
  mergeDOB = ymd(as.character(born_at)),
  defendantDOB = ymd(def_dob)
)]
# The self-match test looks for the voter's first and last name inside the
# defendant name field, plus a matching year of birth.
mneighbors[, `:=`(
  defendantYOB = format(defendantDOB, "%Y"),
  voterYOB = format(mergeDOB, "%Y")
)]
head(mneighbors$voterYOB)
# TRUE where `votername` occurs as a literal substring of `defname`.
# fixed = TRUE turns regular-expression matching off, so characters such as "."
# in a name are matched literally. It is spelled out because `T` is an ordinary
# variable that can be reassigned.
testFunc <- function(votername, defname) grepl(votername, defname, fixed = TRUE)
# Row by row, test whether the voter's first name / last name occurs as a
# substring of the defendant name field. The raw first_name and last_name
# columns are used here, not the upper-cased mergelname.
# NOTE(review): apply() converts the selected columns to a character matrix
# first; presumably both are plain name strings -- confirm.
mneighbors$samefname <- apply(mneighbors[,c('first_name','def_nam')], 1, function(y) testFunc(y['first_name'],y['def_nam'])) #is voter first name in def_name field?
mneighbors$samelname <- apply(mneighbors[,c('last_name','def_nam')], 1, function(y) testFunc(y['last_name'],y['def_nam'])) #same for last name
# voterdef = 1 marks a voter row that appears to be the defendant: both names
# are found in the defendant name and the birth years agree.
mneighbors[, voterdef:=0]
mneighbors[samefname==1 & samelname==1 & defendantYOB==voterYOB, voterdef:=1] #count as voter if matches 1st/last name and YOB.
mnbors <- mneighbors[voterdef!=1,] #drop defendants from the file.
# Difference in dimensions before and after: the first number is the count of rows dropped.
dim(mneighbors)-dim(mnbors) 

##run a quick check to see set difference: should we have dropped the people we dropped?
#setkey(mneighbors, "state_file_id")
#setkey(mnbors, "state_file_id")
#dropvoters <- mneighbors[!mnbors]
#dim(dropvoters)
#head(dropvoters) #these look correct.

#set up turnout measure 2012 (and hist) and some defendant-derived covars
# Defendant sex indicator: 1 for "M", 0 for "F", NA otherwise.
# NOTE(review): `:= NA` creates a logical column, so the 1/0 values assigned to
# male, black and voter_male may end up stored as TRUE/FALSE -- confirm that
# downstream code expects that before changing the column type.
mnbors[, male := NA]
mnbors[def_sex=="M", male := 1]
mnbors[def_sex=="F", male := 0]
# Defendant race indicator: 1 for "B", 0 for any other recorded value, NA when missing.
mnbors[, black := NA]
mnbors[def_rac == "B", black := 1]
mnbors[!is.na(def_rac) & (def_rac != "B"), black := 0]
# over30 splits ageatfile at 10950 (= 30 * 365); this assumes ageatfile is
# measured in days -- TODO confirm.
mnbors[ageatfile < 10950, over30 := 0]
mnbors[ageatfile >= 10950, over30 := 1]

#also clean up whatever voter covars we have from the voter file.
#note that "gender" is from the voter file - it's a factor, could change if wanted.
electionday2012 <- ymd("2012-11-06")
# Voter age in years on election day 2012. Subtracting two Dates yields a
# difftime in days; it is converted to a plain number before dividing so the
# column does not carry a "days" unit label while holding years (the same
# treatment agediff gets later in the script).
mnbors[, voter_age := as.numeric(electionday2012 - mergeDOB) / 365.25]
# Voter sex indicator from the voter file's gender field.
mnbors[, voter_male := NA]
mnbors[gender=="F", voter_male := 0]
mnbors[gender=="M", voter_male := 1]

# Count the rows that share each Match_addr.y value, then keep the addresses
# with fewer than 10 rows.
mnbors[, count := 1]
mnbors[, numberatpoint := sum(count), by = Match_addr.y]
smallneighbors <- mnbors[numberatpoint < 10, ]
dim(smallneighbors)

# Sample size under alternative cutoffs (8, 12 and 20).
smallneighbors8 <- mnbors[numberatpoint < 8, ]
nrow(smallneighbors8)
smallneighbors12 <- mnbors[numberatpoint < 12, ]
nrow(smallneighbors12)
smallneighbors20 <- mnbors[numberatpoint < 20, ]
nrow(smallneighbors20)

# Relative difference of each alternative from the <10 sample; the 12 and 20
# cutoffs cannot give a smaller sample, so those two values are zero or negative.
(nrow(smallneighbors) - nrow(smallneighbors8)) / nrow(smallneighbors)
(nrow(smallneighbors) - nrow(smallneighbors12)) / nrow(smallneighbors)
(nrow(smallneighbors) - nrow(smallneighbors20)) / nrow(smallneighbors)

###########################################################################
# Merge in the updated 2014 voter file to fill in 2012 vote history.
load(file = "fullTXfile2014_minimal.Rdata")
# Full 2014 TX file, already cut down to a couple of columns so it loads quickly.
dim(fullfiletrim)
fullfiletrim[, state_file_id := as.numeric(state_file_id)]

# Diagnostic only: an inner join shows how many voters are missing from the 2014 file.
small1 <- merge(smallneighbors, fullfiletrim, by = "state_file_id")
dim(small1)
dim(smallneighbors)
dim(fullfiletrim)

# The join that is actually used keeps every voter (left join).
small1 <- merge(smallneighbors, fullfiletrim, by = "state_file_id", all.x = TRUE)
dim(small1)
dim(smallneighbors)
dim(fullfiletrim)
smallneighbors <- small1

# Turnout indicators: a flag is 1 when its vote-history field is positive and
# non-missing, and 0 otherwise. vote2012 reads vh12g1; the earlier years read
# the general_* fields (the original note says the mid-2012 file is used for
# earlier voting because it is more complete).
# Names are the new flag columns, values are the history fields they read.
history_fields <- c(
  vote2012 = "vh12g1",
  vote2010 = "general_2010",
  vote2008 = "general_2008",
  vote2006 = "general_2006",
  vote2004 = "general_2004"
)
for (flag_col in names(history_fields)) {
  hist_col <- history_fields[[flag_col]]
  smallneighbors[, (flag_col) := 0]
  smallneighbors[get(hist_col) > 0 & !is.na(get(hist_col)), (flag_col) := 1]
  # Number of voters recorded as voting in that election.
  print(sum(smallneighbors[[flag_col]], na.rm = TRUE))
}

# Parse the first-case filing date into a Date column named `firstcase`.
smallneighbors$firstcase <- ymd(smallneighbors$firstcased)
dim(smallneighbors)
dim(smallneighbors)

# Working copy used for the de-duplication that follows.
smallneighborsfull <- smallneighbors

#keep best defendant match.
# TRUE for every element of `value` that occurs more than once, including the
# first occurrence (plain duplicated() only marks the later ones).
allDup <- function(value) {
  repeated_after_first <- duplicated(value)
  repeated_before_last <- duplicated(value, fromLast = TRUE)
  repeated_after_first | repeated_before_last
}
# Flag every row whose state_file_id occurs more than once (voters matched to
# several defendants) and tabulate the flag.
smallneighborsfull[,dupe := 0]; smallneighborsfull[allDup(smallneighborsfull$state_file_id), dupe:=1];table(smallneighborsfull$dupe) #find all duplicates
# Reverse the score so the best match has the smallest value and sorts first.
# NOTE(review): Score.y is presumably the defendant-side geocoding score left
# by the earlier merge -- confirm.
smallneighborsfull[,defmatchsc := 100-Score.y] #reverse scoring
# Split into voters that occur once and voters that occur several times.
nondupes <- smallneighborsfull[dupe==0,]; dupesall <- smallneighborsfull[dupe==1,]
# Keying sorts by voter id and then by reversed score, so the first row of each
# id is the one with the highest original Score.y.
setkey(dupesall, state_file_id, defmatchsc)
deduped <- dupesall[!duplicated(dupesall$state_file_id),] #drop everything but the first (and best-matched) voter obs
# Recombine: one row per voter.
smallneighborsfull <- rbind(nondupes, deduped)
dim(smallneighborsfull)

smallneighborsfull2 <- smallneighborsfull 

# Mark whether the matched household shares a last name with the defendant
# (for a robustness test): the voter's upper-cased, trimmed last name
# (mergelname) must occur as a literal substring of the defendant name field.
smallneighborsfull2[, samename := 0]
# fixed = TRUE is spelled out because `T` is an ordinary, reassignable variable.
testFunc <- function(votername, defname) grepl(votername, defname, fixed = TRUE)
smallneighborsfull2$samename <- apply(
  smallneighborsfull2[, c('mergelname', 'def_nam')], 1,
  function(y) testFunc(y['mergelname'], y['def_nam'])
)
sum(smallneighborsfull2$samename)
samename <- smallneighborsfull2[samename == 1, ]
dim(samename)
dim(smallneighborsfull2)
dim(samename) / dim(smallneighborsfull2) # just under 40%

# Age gap between defendant and voter in years.
# agediff is the absolute gap. agediff1 is signed, defendant minus voter, so it
# is positive when the defendant's birth date is the more recent one.
smallneighborsfull2[, `:=`(
  agediff = abs(as.numeric(defendantDOB - mergeDOB)) / 365.25,
  agediff1 = as.numeric(defendantDOB - mergeDOB) / 365.25
)]


#not merging in homeowners for this version. 

# Also write out the households matched to people facing felony charges, to
# support the "dropping felony cases" analysis in the SI.
# Everyone whose first case was filed after the 2008 election (2008-11-04),
# including the placebo-test cases.
sn0914 <- smallneighborsfull2[firstcase > "2008-11-04", ]
# Only the columns needed to match back and exclude.
sn0914 <- subset(
  sn0914,
  select = c("state_file_id", "firstcase", "crt")
)
save(sn0914, file = "HHsmatched0914.Rdata")

########################################################################
## generate the de-identified dataset to be used for the felony analysis
########################################################################

## need to keep def. identifier for clustering, but don't need it to be actual court system def_spn-- hash it
library(digest)
# Vectorize() wraps digest() so that each def_spn value is hashed on its own;
# equal ids give equal hashes, so rows can still be grouped by defendant.
vdigest <- Vectorize(digest)
# NOTE(review): digest() is called with its default settings and no salt, so
# anyone who can enumerate plausible def_spn values could recompute the hashes
# and reverse the mapping -- confirm this is acceptable for the released file.
smallneighborsfull2$hashedID <- vdigest(smallneighborsfull2$def_spn)

# Print the available columns for reference.
colnames(smallneighborsfull2)
# Columns released in the de-identified file: defendant and case covariates,
# voter covariates, turnout flags, and the hashed id. Names, addresses, dates of
# birth and the raw def_spn are not in this list.
keeps <- c("residential_zip5", "def_rac", "def_sex", "ageatfile", "fyear", "firstcase", "totalsente", "anyjail", "anyconv", "anyfine", "anyprobati", "mostsevcha", "male", "black", "over30", "voter_age", "voter_male", "numberatpoint", "vote2012", "vote2008", "vote2004","vote2010", "vote2006", "samename", "hashedID", "agediff", "agediff1") 

deid <-  subset(smallneighborsfull2, select=keeps)
setnames(deid, "hashedID", "def_spn") #rename the hashed version to the original column name so all the original code works. 

# Replace the working table with the de-identified one so the saved object
# keeps the name smallneighborsfull2.
smallneighborsfull2 <- deid

save(smallneighborsfull2, file="felonycases_householdsmerged_deidentified.Rdata") 




